library(tidyverse)
## -- Attaching packages ------------------------------------------------------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.3     v dplyr   1.0.2
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## -- Conflicts ---------------------------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(magrittr)
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(DT)

Parameters

# Distance metric used by the nearest-neighbor search: 'Manhattan' or 'Euclidean'
DIST_METHOD = 'Manhattan'
# Error-metric label tracks the distance metric (MAE for Manhattan, RMSE for Euclidean)
errMethod = ifelse(DIST_METHOD == 'Manhattan','MAE','RMSE')

# Weight given to a neighbor as a function of its signal distance
#weightFormula = function(x){ 1/(x)} #inverse of distance
weightFormula = function(x){ 1/(x ^ 2)} #inverse of sq distance

# Keep only required MAC Addresses (the 6 access points used in the analysis)
keepMacs = c(
  '00:0f:a3:39:e1:c0', #default
  #'00:0f:a3:39:dd:cd', #added
  '00:14:bf:b1:97:8a',
  '00:14:bf:3b:c7:c6',
  '00:14:bf:b1:97:90',
  '00:14:bf:b1:97:8d',
  '00:14:bf:b1:97:81'
)

Files

# Raw trace files: "offline" is the training survey (known positions),
# "online" is the evaluation set
offline_file = "../../data/offline.final.trace.txt"
online_file = "../../data/online.final.trace.txt"

Functions (Process Raw Data)

# Parse one raw trace line into a character matrix: one row per signal
# reading, with the scanning-device info prepended to every row
processLine = function(x){
  # Tokenize the line on ';', '=' and ','
  parts = strsplit(x, "[;=,]")[[1]]

  # The first 10 tokens describe the hand-held scanning device itself
  # (time, MAC, position, orientation -- book page 9). A line with exactly
  # 10 tokens recorded no signals, so there is nothing to return.
  if (length(parts) == 10) {
    return(NULL)
  }

  # Remaining tokens come in groups of 4 per reading:
  # access-point MAC, signal, channel and device type (book pages 6, 9)
  readings = matrix(data = parts[ - (1:10) ], ncol = 4, byrow = TRUE)

  # Replicate the device info (time, scan MAC, x, y, z, orientation)
  # alongside each reading so every row is self-contained
  device = matrix(parts[c(2, 4, 6:8, 10)], nrow(readings), 6, byrow = TRUE)
  cbind(device, readings)
}
#' @description Function to read the data, clean it and process it into an appropriate format
#' @param file Filename to be read in
#' @param keepMacs a character vector of MAC addresses to keep;
#'   if NULL (the default) all MAC addresses are kept
#' @returns A dataframe with one row per (time, position, mac) signal reading
readData = function(file, keepMacs=NULL){
  # Read in the raw text file
  txt = readLines(file)

  ##############################
  #### Process the raw data ####
  ##############################

  # Drop comment lines, then parse each measurement line into a matrix
  lines = txt[substr(txt, 1, 1) != "#" ]
  tmp = lapply(lines, processLine)

  # Stack the per-line matrices into a single data frame
  data = as.data.frame(do.call("rbind", tmp), stringsAsFactors = FALSE)

  ######################################################################
  #### Cleaning the Data and Building a Representation for Analysis ####
  ######################################################################

  # Assign column names to the data frame
  names(data) = c(
    "time", "scanMac", "posX", "posY", "posZ",
    "orientation", "mac", "signal",
    "channel", "type"
  )

  # Everything was parsed as character; convert the numeric columns
  numVars = c("time", "posX", "posY", "posZ", "orientation", "signal")
  data[numVars] = lapply(data[numVars], as.numeric)

  # Keep only required device types (remove adhoc, type 1)
  data = data[data$type != 1, ]

  # Keep only required MAC Addresses. Previously a NULL keepMacs silently
  # dropped every row (x %in% NULL is all-FALSE); NULL now means "keep all"
  if (!is.null(keepMacs)) {
    data = data[data$mac %in% keepMacs, ]
  }

  # Discard unwanted columns that dont add any additional information
  data = data[ , !(names(data) %in% c("scanMac", "posZ"))]

  # Snap the measured orientation to the nearest 45-degree reference angle
  data$angle = roundOrientation(data$orientation)

  # Position identifier "x-y", used as the grouping key throughout
  data$posXY = paste(data$posX, data$posY, sep = "-")

  return(data)
}

Offline data

# Number of access points retained -- used downstream as the count of
# signal columns in the wide data (reshapeSS, findNN)
numMacs = length(keepMacs)
numMacs
## [1] 6
# Snap each angle to the nearest 45-degree reference angle.
# 360 is included as a reference so values near 360 snap correctly,
# but it is reported as 0 (wrap-around).
roundOrientation = function(angles) {
  refs = seq(0, 360, by = 45)      # 0, 45, ..., 360 (9 references)
  labels = c(refs[1:8], 0)         # 360 wraps back to 0
  nearest = sapply(angles, function(a) which.min(abs(a - refs)))
  labels[nearest]
}
offline = readData(file = offline_file, keepMacs = keepMacs)
dim(offline)
## [1] 769332     10
length(unique(offline$posXY))
## [1] 166

Online Data

online = readData(file = online_file, keepMacs = keepMacs)
dim(online)
## [1] 34778    10
length(unique(online$posXY))
## [1] 60

Function (Reshape)

#' @description Reshape tall signal data to wide ("tall2wide"): one row per
#'   location, one averaged-signal column per MAC address.
#'   NOTE(review): relies on the global `numMacs` for the column count.
#' @param data Tall data frame with posXY, angle, mac and a signal column
#' @param varSignal Name of the signal column to average
#' @param keepVars Identifier columns carried through to the wide output
#' @param sampleAngle If TRUE, keep only one randomly chosen reference angle
#'   per location before averaging (used to mimic online data during CV)
#' @returns A data frame with one row per unique posXY
reshapeSS = function(data, varSignal = "signal", keepVars = c("posXY", "posX", "posY"), sampleAngle = FALSE) {
  # The 8 reference angles a sampled orientation can take
  refs = seq(0, by = 45, length = 8)
  byLocation =
  with(
    data,
    by(
      data,
      list(posXY),
      function(x) {
        # Optionally restrict this location to a single random angle
        if (sampleAngle) x = x[x$angle == sample(refs, size = 1), ]
        # Identifier values for this location (taken from its first row)
        ans = x[1, keepVars]
        # Mean signal per access point at this location
        avgSS = tapply(x[ , varSignal ], x$mac, mean)
        y = matrix(avgSS, nrow = 1, ncol = numMacs,
        dimnames = list(ans$posXY, names(avgSS)))
        cbind(ans, y)
      }
    )
  )
  # Stack the per-location one-row frames into a single wide data frame
  newDataSS = do.call("rbind", byLocation)
  return(newDataSS)
}

Reshape Test Data

keepVars = c("posXY", "posX","posY", "orientation", "angle")
onlineSummary = reshapeSS(data = online, varSignal = "signal", keepVars = keepVars)
head(onlineSummary,10)
##                 posXY posX  posY orientation angle 00:0f:a3:39:e1:c0
## 0-0.05         0-0.05 0.00  0.05       130.5   135         -52.22727
## 0.15-9.42   0.15-9.42 0.15  9.42       112.3    90         -55.27523
## 0.31-11.09 0.31-11.09 0.31 11.09       230.1   225         -51.70909
## 0.47-8.2     0.47-8.2 0.47  8.20         5.8     0         -49.50000
## 0.78-10.94 0.78-10.94 0.78 10.94       348.3     0         -53.26364
## 0.93-11.69 0.93-11.69 0.93 11.69       158.3   180         -57.96364
## 1.08-12.19 1.08-12.19 1.08 12.19       229.1   225         -54.82727
## 1.24-3.93   1.24-3.93 1.24  3.93       261.5   270         -56.47273
## 1.39-6.61   1.39-6.61 1.39  6.61       114.1   135         -51.28182
## 1.52-9.32   1.52-9.32 1.52  9.32         7.0     0         -50.36697
##            00:14:bf:3b:c7:c6 00:14:bf:b1:97:81 00:14:bf:b1:97:8a
## 0-0.05             -62.94898         -61.81395         -40.06897
## 0.15-9.42          -73.96190         -72.70103         -47.81308
## 0.31-11.09         -70.08247         -70.09890         -54.08824
## 0.47-8.2           -64.25806         -72.59770         -45.65289
## 0.78-10.94         -66.96000         -66.80952         -48.41379
## 0.93-11.69         -70.44340         -70.58025         -43.66346
## 1.08-12.19         -69.20192         -67.92553         -52.00820
## 1.24-3.93          -69.62745         -59.76136         -38.91753
## 1.39-6.61          -62.23913         -64.56627         -48.92381
## 1.52-9.32          -63.35922         -67.48913         -50.04167
##            00:14:bf:b1:97:8d 00:14:bf:b1:97:90
## 0-0.05             -63.04301         -55.23333
## 0.15-9.42          -69.45455         -46.88000
## 0.31-11.09         -69.13158         -53.88660
## 0.47-8.2           -60.79747         -49.58000
## 0.78-10.94         -65.00000         -54.84694
## 0.93-11.69         -65.59302         -47.27083
## 1.08-12.19         -71.58696         -51.66667
## 1.24-3.93          -71.66667         -53.23333
## 1.39-6.61          -60.79798         -50.49057
## 1.52-9.32          -65.10345         -49.38542

Function (Select Train Data)

#' @description Selects the training observations whose orientation is close
#' to the test observation's orientation, and reshapes them to wide format
#' suitable for the KNN search
#' @param angleNewObs Angle (orientation) of the test observation
#' @param train_data Original tall-skinny training data
#' @param m Keep the 'm' reference orientations closest to angleNewObs
#' @returns A wide data frame (one row per location) for training
selectTrain = function(angleNewObs, train_data, m){

  # Snap the observation's orientation to the nearest 45-degree reference
  center = roundOrientation(angles = angleNewObs)

  if (m %% 2 == 1) {
    # Odd m: a symmetric window of (m-1)/2 reference angles on each side
    offsets = 45 * seq(-(m - 1) / 2, (m - 1) / 2)
  } else {
    # Even m: build an odd window of m+1 angles, then drop the end
    # farthest from the true (unrounded) orientation
    mm = m + 1
    offsets = 45 * seq(-(mm - 1) / 2, (mm - 1) / 2)
    if (sign(angleNewObs - center) > -1) {
      offsets = offsets[ -1 ]
    } else {
      offsets = offsets[ -mm ]
    }
  }

  # Recenter on the snapped orientation and wrap into [0, 360]
  keepAngles = offsets + center
  keepAngles = ifelse(keepAngles < 0, keepAngles + 360, keepAngles)
  keepAngles = ifelse(keepAngles > 360, keepAngles - 360, keepAngles)

  # Subset the tall data to those angles, then average per location (wide)
  tall_subset = train_data[train_data$angle %in% keepAngles, ]
  reshapeSS(data = tall_subset, varSignal = "signal")
}

Nearest Neighbors

Common Functions

#' @description Computes the distance of the new signal (single observation) to each observation in the training dataset
#' @param newSignal The signal values (one per access point) for the new observation
#' @param trainSubset The wide training data; columns 1:3 are posXY/posX/posY,
#'   the next `numMacs` columns are the per-AP mean signals
#' @param weighted Whether the mean value should be weighted based on distance or not.
#' @param method Distance metric: 'Euclidean' or 'Manhattan'
#' @return A dataframe with the same number of rows as the training data,
#'         ordered by distance to the new signal. Columns:
#'         1) posXY (string), 2) posX (float), 3) posY (float),
#'         4) distance to the new observation (float),
#'         5) weight (float; hard-coded to 1 for all rows if weighted = FALSE)
findNN = function(newSignal, trainSubset, weighted=FALSE, method = DIST_METHOD) {
  # Signal columns start right after the three location columns
  diffs = apply(trainSubset[ , 4:(4+numMacs-1)], 1, function(x) x - newSignal)
  if (method == 'Euclidean') {
    dists = apply(diffs, 2, function(x) sqrt(sum(x^2)) ) #RSE
  } else if (method == 'Manhattan') {
    dists = apply(diffs, 2, function(x) sum(abs(x)) ) #AE
  } else {
    # Previously an unknown method left `dists` undefined, producing a
    # confusing "object 'dists' not found" error
    stop("Unknown distance method: ", method)
  }
  closest = order(dists)

  ordered_dist = dists[closest]
  # Inverse-distance weights (via weightFormula) or uniform weights
  if (weighted) {
    weight = weightFormula(ordered_dist)
  } else {
    weight = rep(1, length(dists))
  }
  return(cbind(trainSubset[closest, 1:3], ordered_dist, weight))
}
#' @description XY Prediction for a single value of k (num neighbors)
#' @param newSignals The Signal Values for the validation data for each observation
#' @param newAngles The Orientation of the validation data for each observation
#' @param trainData The training data to be used
#' @param numAngles Number of closest reference angles to include in the data
#' @param k Perform the prediction for num neighbors = k
#' @param weighted Whether the mean value should be weighted based on distance or not.
#' @return A matrix with num rows = number of (validation) observations and num columns = 2
#'         Each row is the predicted mean X and Y for that observation
predXY = function(newSignals, newAngles, trainData, numAngles = 1, k = 3, weighted=FALSE){
  # Preallocate one list slot per validation observation.
  # (Was `list(length = n)`, which actually builds a 1-element list whose
  # single element is named "length"; it only worked because [[i]] assignment
  # grows a list, and it leaked a stray "length" row name into the result.)
  closeXY = vector("list", nrow(newSignals))
  for (i in seq_len(nrow(newSignals))) {
    # Training rows restricted to orientations near this observation's angle
    trainSS = selectTrain(newAngles[i], trainData, m = numAngles)
    closeXY[[i]] = findNN(
      newSignal = as.numeric(newSignals[i, ]),
      trainSubset = trainSS,
      weighted = weighted
    )
  }

  #' @description Returns the (un)weighted mean X and Y locations for a single observation and single value of neighbors
  #' @param x Dataframe with 5 columns: posXY (string), posX (float),
  #'   posY (float), distance to the observation (float),
  #'   weight (float; all 1 when unweighted)
  #' @param k Number of nearest neighbors to use
  #' @return A pair of XY mean values over the k nearest neighbors
  k_means_single_obs = function(x, k){
    weights = x[1:k, 5]
    weighted_x = sum(x[1:k, 2] * weights) / sum(weights)
    weighted_y = sum(x[1:k, 3] * weights) / sum(weights)
    return(c(weighted_x, weighted_y))
  }

  # Average the k nearest training locations for every observation
  estXY = lapply(closeXY, k_means_single_obs, k)
  estXY = do.call("rbind", estXY)
  return(estXY)
}
#' @description Computes the aggregate prediction error between estimated and
#'   actual XY positions, averaged over observations
#' @param estXY Estimated positions: a matrix/data frame with X and Y columns,
#'   or a bare numeric vector for a single prediction
#' @param actualXY Actual positions, same shape as estXY
#' @param method 'Euclidean' (RMSE-like) or 'Manhattan' (MAE-like)
#' @return A single numeric error value
calcError = function(estXY, actualXY, method = DIST_METHOD){
  # A single prediction may arrive as a bare numeric vector (class "numeric")
  # rather than a matrix/data frame; inherits() mirrors the original
  # `'numeric' %in% class(estXY)` check
  if (inherits(estXY, "numeric")) rows = 1 else rows = nrow(estXY)
  if (method == 'Euclidean') {
    er = sqrt(sum(rowSums((estXY - actualXY)^2)))/rows
  } else if (method == 'Manhattan') {
    er = sum(rowSums(abs(estXY - actualXY)))/rows
  } else {
    # Previously an unknown method left `er` undefined
    stop("Unknown distance method: ", method)
  }
  return(er)
}

K-Fold

Setup

# Reproducible fold assignment
set.seed(42)
K = 20   # largest neighbor count evaluated (grid is k = 1..K)
v = 11   # number of cross-validation folds
allNeighbors = c(1:K)
allNeighbors
##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
allAngles = c(1:8)   # candidate values for numAngles in the grid search
allAngles
## [1] 1 2 3 4 5 6 7 8
# Shuffle locations, then arrange into a (locations-per-fold x folds) matrix.
# NOTE(review): 166 locations do not fill 15 x 11 = 165 cells evenly, so the
# last sampled location is dropped (hence the warning below)
permuteLocs = sample(unique(offline$posXY))
permuteLocs = matrix(permuteLocs, ncol = v, nrow = floor(length(permuteLocs)/v))
## Warning in matrix(permuteLocs, ncol = v, nrow = floor(length(permuteLocs)/v)):
## data length [166] is not a sub-multiple or multiple of the number of rows [15]
permuteLocs
##       [,1]   [,2]   [,3]   [,4]   [,5]   [,6]   [,7]   [,8]   [,9]   [,10] 
##  [1,] "11-3" "24-6" "27-8" "25-8" "7-7"  "24-8" "22-8" "12-5" "0-12" "0-7" 
##  [2,] "9-7"  "21-6" "2-9"  "24-7" "5-8"  "15-8" "11-8" "32-6" "2-7"  "29-8"
##  [3,] "14-7" "30-3" "12-7" "23-4" "2-2"  "10-5" "4-7"  "17-8" "0-11" "17-7"
##  [4,] "13-8" "3-3"  "32-8" "30-7" "2-8"  "33-7" "25-3" "1-13" "12-6" "24-3"
##  [5,] "33-8" "15-7" "30-8" "0-3"  "20-3" "21-3" "2-11" "13-3" "8-8"  "22-3"
##  [6,] "21-8" "0-8"  "0-1"  "2-13" "1-5"  "0-4"  "0-0"  "23-3" "1-10" "16-8"
##  [7,] "20-8" "24-5" "26-3" "31-8" "31-3" "10-4" "3-8"  "2-5"  "8-7"  "0-9" 
##  [8,] "27-3" "1-1"  "32-3" "2-6"  "13-7" "11-4" "26-6" "8-3"  "26-4" "1-3" 
##  [9,] "9-3"  "19-3" "2-1"  "14-8" "33-4" "6-8"  "5-7"  "1-4"  "13-6" "4-8" 
## [10,] "1-12" "25-4" "20-7" "24-4" "26-7" "0-13" "22-4" "19-7" "33-3" "18-8"
## [11,] "10-8" "2-0"  "1-0"  "16-7" "7-8"  "3-7"  "6-7"  "26-8" "2-4"  "32-5"
## [12,] "21-4" "11-5" "21-7" "27-7" "32-7" "13-5" "15-3" "9-4"  "7-3"  "22-7"
## [13,] "16-3" "4-3"  "28-8" "21-5" "1-6"  "22-5" "19-8" "10-7" "1-11" "25-7"
## [14,] "26-5" "18-7" "23-7" "12-8" "29-3" "23-5" "13-4" "2-10" "2-3"  "10-3"
## [15,] "1-8"  "5-3"  "1-9"  "0-10" "12-4" "31-7" "28-3" "11-7" "23-6" "28-7"
##       [,11] 
##  [1,] "10-6"
##  [2,] "1-7" 
##  [3,] "23-8"
##  [4,] "11-6"
##  [5,] "14-3"
##  [6,] "12-3"
##  [7,] "29-7"
##  [8,] "22-6"
##  [9,] "18-3"
## [10,] "17-3"
## [11,] "9-8" 
## [12,] "6-3" 
## [13,] "32-4"
## [14,] "0-2" 
## [15,] "1-2"
onlineFold = subset(offline, posXY %in% permuteLocs[ , 1])
head(onlineFold)
##                time posX posY orientation               mac signal    channel
## 154732 1.139653e+12    1    8         0.7 00:14:bf:b1:97:8a    -47 2437000000
## 154733 1.139653e+12    1    8         0.7 00:14:bf:b1:97:8d    -52 2442000000
## 154734 1.139653e+12    1    8         0.7 00:0f:a3:39:e1:c0    -52 2462000000
## 154735 1.139653e+12    1    8         0.7 00:14:bf:b1:97:90    -51 2427000000
## 154736 1.139653e+12    1    8         0.7 00:14:bf:3b:c7:c6    -68 2432000000
## 154737 1.139653e+12    1    8         0.7 00:14:bf:b1:97:81    -67 2422000000
##        type angle posXY
## 154732    3     0   1-8
## 154733    3     0   1-8
## 154734    3     0   1-8
## 154735    3     0   1-8
## 154736    3     0   1-8
## 154737    3     0   1-8
# For reference
head(onlineSummary)
##                 posXY posX  posY orientation angle 00:0f:a3:39:e1:c0
## 0-0.05         0-0.05 0.00  0.05       130.5   135         -52.22727
## 0.15-9.42   0.15-9.42 0.15  9.42       112.3    90         -55.27523
## 0.31-11.09 0.31-11.09 0.31 11.09       230.1   225         -51.70909
## 0.47-8.2     0.47-8.2 0.47  8.20         5.8     0         -49.50000
## 0.78-10.94 0.78-10.94 0.78 10.94       348.3     0         -53.26364
## 0.93-11.69 0.93-11.69 0.93 11.69       158.3   180         -57.96364
##            00:14:bf:3b:c7:c6 00:14:bf:b1:97:81 00:14:bf:b1:97:8a
## 0-0.05             -62.94898         -61.81395         -40.06897
## 0.15-9.42          -73.96190         -72.70103         -47.81308
## 0.31-11.09         -70.08247         -70.09890         -54.08824
## 0.47-8.2           -64.25806         -72.59770         -45.65289
## 0.78-10.94         -66.96000         -66.80952         -48.41379
## 0.93-11.69         -70.44340         -70.58025         -43.66346
##            00:14:bf:b1:97:8d 00:14:bf:b1:97:90
## 0-0.05             -63.04301         -55.23333
## 0.15-9.42          -69.45455         -46.88000
## 0.31-11.09         -69.13158         -53.88660
## 0.47-8.2           -60.79747         -49.58000
## 0.78-10.94         -65.00000         -54.84694
## 0.93-11.69         -65.59302         -47.27083
keepVars = c("posXY", "posX","posY", "orientation", "angle")
onlineCVSummary = reshapeSS(offline, keepVars = keepVars, sampleAngle = TRUE)
head(onlineCVSummary)
##      posXY posX posY orientation angle 00:0f:a3:39:e1:c0 00:14:bf:3b:c7:c6
## 0-0    0-0    0    0        90.3    90         -50.58559         -62.59551
## 0-1    0-1    0    1        89.8    90         -55.17273         -65.70000
## 0-10  0-10    0   10       225.3   225         -51.47706         -63.01163
## 0-11  0-11    0   11       135.2   135         -50.16364         -67.87778
## 0-12  0-12    0   12       135.0   135         -52.52727         -68.90816
## 0-13  0-13    0   13       135.2   135         -54.91818         -71.76190
##      00:14:bf:b1:97:81 00:14:bf:b1:97:8a 00:14:bf:b1:97:8d 00:14:bf:b1:97:90
## 0-0          -63.78261         -33.74737         -63.12941         -55.19588
## 0-1          -63.94186         -40.21782         -63.52381         -60.47826
## 0-10         -68.58416         -49.01010         -66.71111         -54.56180
## 0-11         -72.34000         -47.57895         -66.17241         -53.97000
## 0-12         -70.32222         -43.11009         -63.73494         -48.99000
## 0-13         -74.19588         -42.49438         -67.70423         -50.82828
# First Fold (validation)
onlineFold = subset(onlineCVSummary, posXY %in% permuteLocs[ , 1])
head(onlineCVSummary,01)
##     posXY posX posY orientation angle 00:0f:a3:39:e1:c0 00:14:bf:3b:c7:c6
## 0-0   0-0    0    0        90.3    90         -50.58559         -62.59551
##     00:14:bf:b1:97:81 00:14:bf:b1:97:8a 00:14:bf:b1:97:8d 00:14:bf:b1:97:90
## 0-0         -63.78261         -33.74737         -63.12941         -55.19588
# First Fold (Train)
offlineFold = subset(offline, posXY %in% permuteLocs[ , -1])
head(offlineFold)
##           time posX posY orientation               mac signal    channel type
## 1 1.139643e+12    0    0           0 00:14:bf:b1:97:8a    -38 2437000000    3
## 2 1.139643e+12    0    0           0 00:14:bf:b1:97:90    -56 2427000000    3
## 3 1.139643e+12    0    0           0 00:0f:a3:39:e1:c0    -53 2462000000    3
## 4 1.139643e+12    0    0           0 00:14:bf:b1:97:8d    -65 2442000000    3
## 5 1.139643e+12    0    0           0 00:14:bf:b1:97:81    -65 2422000000    3
## 6 1.139643e+12    0    0           0 00:14:bf:3b:c7:c6    -66 2432000000    3
##   angle posXY
## 1     0   0-0
## 2     0   0-0
## 3     0   0-0
## 4     0   0-0
## 5     0   0-0
## 6     0   0-0
estFold = predXY(
  newSignals = onlineFold[ , 6:(6+numMacs-1)],
  newAngles = onlineFold[ , 4],
  offlineFold,
  numAngles = 3,
  k = 3
)
actualFold = onlineFold[ , c("posX", "posY")]
calcError(estFold, actualFold)
## [1] 2.266667

Faster Cross Validation

Common Functions

#' @description Modified XY Prediction to help with faster CV for all K values at once (from 1 to K)
#' @param newSignals The Signal Values for the validation data for each observation
#' @param newAngles The Orientation of the validation data for each observation
#' @param trainData The training data to be used
#' @param numAngles Number of closest reference angles to include in the data
#' @param K Perform the prediction for num neighbors from 1 to K
#' @param weighted Whether the mean value should be weighted based on distance or not.
#' @return A nested structure with num rows = number of (validation) observations and num columns = K
#'         Each entry is a vector of 2 values giving the predicted mean X and Y
#'         for that observation and neighbor count
predXYallK = function(newSignals, newAngles, trainData, numAngles = 1, K = 10, weighted=FALSE){
  # Preallocate one list slot per validation observation.
  # (Was `list(length = n)`: a 1-element list whose element is named "length",
  # not a true preallocation -- same latent defect as the original predXY.)
  closeXY = vector("list", nrow(newSignals))
  for (i in seq_len(nrow(newSignals))) {
    # Training rows restricted to orientations near this observation's angle
    trainSS = selectTrain(newAngles[i], trainData, m = numAngles)
    closeXY[[i]] = findNN(
      newSignal = as.numeric(newSignals[i, ]),
      trainSubset = trainSS,
      weighted = weighted
    )
  }

  #' @description Returns the (un)weighted mean X and Y locations for a single observation and single value of neighbors
  #' @param x Dataframe with 5 columns: posXY (string), posX (float),
  #'   posY (float), distance to the observation (float),
  #'   weight (float; all 1 when unweighted)
  #' @param k Number of nearest neighbors to use
  #' @return A pair of XY mean values over the k nearest neighbors
  k_means_single_obs = function(x, k){
    weights = x[1:k, 5]
    weighted_x = sum(x[1:k, 2] * weights) / sum(weights)
    weighted_y = sum(x[1:k, 3] * weights) / sum(weights)
    return(c(weighted_x, weighted_y))
  }

  #' @description Returns the (un)weighted mean X and Y locations for a single observation for every k from 1 to K
  #' @param x Same 5-column dataframe as k_means_single_obs
  #' @param K Largest neighbor count to evaluate
  #' @return A list of K pairs (each pair is the XY mean for one k)
  all_K_means_single_obs = function(x, K){
    rows = vector("list", K)
    for (k in seq_len(K)) {
      rows[[k]] = k_means_single_obs(x, k)
    }
    return(rows)
  }

  # One list of K predictions per observation, stacked row-wise
  estXY = lapply(closeXY, all_K_means_single_obs, K)
  estXY = do.call("rbind", estXY)
  return(estXY)
}
#' @description Runs v-fold cross validation, computing the prediction error
#'   for every neighbor count from 1 to K at a fixed numAngles
#' @param K Number of nearest neighbors to use (grid search over k = 1 to K)
#' @param v Number of folds to use
#' @param offline Tall training data ("as is" from script for now)
#' @param onlineCVSummary Wide per-location summary used as validation data
#' @param folds A matrix with rows = number of observations in each fold and columns = number of folds.
#'              The values are the posXY IDs belonging to that fold
#' @param numAngles Number of closest reference angles to include in the data
#' @param weighted Whether the mean value should be weighted based on distance or not.
#' @return A list with `err` (length-K vector of errors summed over folds) and
#'         `allErr` (long data frame of per-fold, per-k errors)
run_kfold = function(K, v, offline, onlineCVSummary, folds, numAngles, weighted=FALSE){
  err= rep(0, K)
  errCV = rep(0, K)
  allErr = data.frame()
  for (j in 1:v) {
    print(paste("Running Fold: ", j))
    # Fold j is validation; all other folds form the training data
    onlineFold = subset(onlineCVSummary, posXY %in% folds[ , j])
    offlineFold = subset(offline, posXY %in% folds[ , -j])
    actualFold = onlineFold[ , c("posX", "posY")]
    
    # Predict once, obtaining results for every k = 1..K simultaneously.
    # Columns 6:(6+numMacs-1) are the per-AP signal columns; column 4 is orientation
    estFold = predXYallK(
        newSignals = onlineFold[ , 6:(6+numMacs-1)],
        newAngles = onlineFold[ , 4],
        trainData = offlineFold,
        numAngles = numAngles,
        K = K,
        weighted=weighted
      )
    # Reformat into correct format for each 'k' value.
    # estFold is a matrix of list entries: estFold[i, k] holds the XY pair
    # for observation i at neighbor count k
    for(k in 1:K){ 
      estSingleK = data.frame()
      for(i in seq(1, length(estFold)/K)){  # i = index of the observation
        estSingleK = rbind(estSingleK, t(as.data.frame(estFold[i,k])))
      }
      err[k] = err[k] + calcError(estSingleK, actualFold)
      errCV[k] =  calcError(estSingleK, actualFold) # per-fold error, kept for allErr
    }
    allErr=rbind(allErr,data.frame('fold'=j, 'numNeighbors' = 1:K,'errValue' = errCV))
  } 
  
  return(list(err=err,allErr=allErr))
}

Parallel CV and Plot

#' @description Runs the full cross-validation grid (every numAngles in
#'   allAngles, each over k = 1..K) in parallel, one worker per numAngles value
#' @param K Largest neighbor count evaluated
#' @param v Number of folds
#' @param offline Tall training data
#' @param onlineCVSummary Wide per-location validation summary
#' @param permuteLocs Fold-assignment matrix (locations x folds)
#' @param numAngles NOTE(review): this parameter is never used -- the foreach
#'   loop below rebinds `numAngles` to each value of the global `allAngles`
#' @param weighted Whether neighbor averaging is distance-weighted
#' @return A long data frame of per-fold, per-k, per-numAngles errors
get_CV = function(K,v,offline,onlineCVSummary,permuteLocs,numAngles,weighted = TRUE){
  library(foreach)
  library(progress)
  library(doParallel)
  library(doSNOW)
  # One worker per core; doSNOW is used so the progress bar can tick
  cl <- makeCluster(detectCores())
  doSNOW::registerDoSNOW(cl)
  
  allErrors = data.frame()
  
  start = proc.time()
  
  pb <- progress::progress_bar$new(total = length(allAngles),format='[:bar] :percent :eta')
  progress <- function(n) pb$tick()
  # Each parallel task handles one numAngles value; .export ships the
  # helper functions and globals the workers need
  allErrorsCV = foreach(numAngles = allAngles
                        ,.combine = rbind
                        ,.options.snow = list(progress=progress)
                        ,.export = c('run_kfold','predXYallK','reshapeSS','findNN','calcError'
                                     ,'numMacs','selectTrain','roundOrientation','DIST_METHOD'
                                     ,'weightFormula')
  ) %dopar% {
    print(paste("Running ", v, "-fold cross validation with 1 to ", K, " neighbors, for number of Angles = ", numAngles))
    err = run_kfold(
      K = K,
      v = v,
      offline = offline,
      onlineCVSummary = onlineCVSummary,
      folds = permuteLocs,
      numAngles = numAngles,
      weighted = weighted
    )
    # Tag results with the numAngles value this worker handled
    err$allErr$numAngles = numAngles
    
    return(err$allErr)
    #return(data.frame(t(err)))
  }
  stopCluster(cl)
  stop = proc.time()
  diff = stop-start
  print(diff)
  
  return(allErrorsCV)
}

#' @description Averages CV errors across folds for each (numAngles, k)
#'   combination and flags the best model and the one-SE model (caret rules)
#' @param allErrorsCV Long per-fold error frame from get_CV
#' @return A tibble with mean errValue per combination, plus `best` and
#'   `oneSE` indicator columns
find_best = function(allErrorsCV){
  library('caret')
  library(tidyverse)
  allErrors = allErrorsCV %>%
    group_by(numAngles,numNeighbors) %>%
    dplyr::summarise(errValue = mean(errValue)) %>%
    ungroup() %>%
    # NOTE(review): sd() here runs after ungroup(), so errValueSD is the SD of
    # the mean errors across ALL combinations (one constant value), not a
    # per-group SD -- confirm this is intended before relying on it
    mutate(errValueSD=sd(errValue)
           ,best=FALSE
           ,oneSE=FALSE)
  # caret::best picks the minimum-error row; caret::oneSE picks the simplest
  # model within one SE of the best (num = rows per performance group)
  allErrors[best(as.data.frame(allErrors),"errValue",maximize=FALSE),]$best=TRUE
  allErrors[oneSE(as.data.frame(allErrors),"errValue",maximize=FALSE,num=30),]$oneSE=TRUE
  return(allErrors)
}

#' @description Heatmap of CV error over the (numNeighbors, numAngles) grid,
#'   with the best and one-SE models labeled; rendered interactively via plotly
#' @param allErrors Output of find_best (mean errors with best/oneSE flags)
#' @return A plotly htmlwidget
plot_best = function(allErrors) {
  p = ggplot(allErrors, aes(x=numNeighbors, y=numAngles, fill= errValue
                         , text=paste0("A:",numAngles," N:",numNeighbors,errMethod," :", round(errValue,3)))) + 
    geom_tile() +
    scale_y_continuous(breaks=allAngles) +
    #scale_fill_distiller(palette = "RdYlBu") +
    # Diverging fill centered on the midpoint of the observed error range
    scale_fill_gradient2(low = "green",mid='darkorange', high = "darkred", na.value = NA
                         ,midpoint=mean(c(max(allErrors$errValue),min(allErrors$errValue)))
                         #,midpoint=median(Errors$errValue)
    )+
    #scale_fill_distiller(palette = "Blues",direction=0) +
    labs(fill = errMethod) +
    # Label the best cell (nudged up so it doesn't overlap a 1SE label)
    geom_text(data=allErrors[allErrors$best,],label='Best',size=3,nudge_y=.27) +
    geom_text(data=allErrors[allErrors$oneSE,],label='1SE',size=3) 
  #p
  ggplotly(p, tooltip="text")
}

Unweighted

allErrorsCV = get_CV(K=K,v=v
                     ,offline=offline,onlineCVSummary=onlineCVSummary
                     ,permuteLocs=permuteLocs,numAngles=numAngles
                     ,weighted = FALSE)
## 
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
## 
##     accumulate, when
## Loading required package: iterators
## Loading required package: parallel
## Loading required package: snow
## 
## Attaching package: 'snow'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, clusterSplit, makeCluster, parApply,
##     parCapply, parLapply, parRapply, parSapply, splitIndices,
##     stopCluster
##    user  system elapsed 
##    2.53    2.70  134.58
allErrors = find_best(allErrorsCV)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
## `summarise()` regrouping output by 'numAngles' (override with `.groups` argument)
allErrors
## # A tibble: 160 x 6
##    numAngles numNeighbors errValue errValueSD best  oneSE
##        <int>        <int>    <dbl>      <dbl> <lgl> <lgl>
##  1         1            1     3.43      0.194 FALSE FALSE
##  2         1            2     2.95      0.194 FALSE FALSE
##  3         1            3     2.79      0.194 FALSE FALSE
##  4         1            4     2.78      0.194 FALSE FALSE
##  5         1            5     2.78      0.194 FALSE FALSE
##  6         1            6     2.81      0.194 FALSE FALSE
##  7         1            7     2.87      0.194 FALSE FALSE
##  8         1            8     2.88      0.194 FALSE FALSE
##  9         1            9     2.95      0.194 FALSE FALSE
## 10         1           10     2.97      0.194 FALSE FALSE
## # ... with 150 more rows
print(filter(allErrors,best | oneSE))
## # A tibble: 1 x 6
##   numAngles numNeighbors errValue errValueSD best  oneSE
##       <int>        <int>    <dbl>      <dbl> <lgl> <lgl>
## 1         6            2     2.70      0.194 TRUE  TRUE
plot_best(allErrors)

Final Model

final = filter(allErrors,oneSE)

finalAngle = final$numAngles
finalK = final$numNeighbors
finalAngle
## [1] 6
finalK
## [1] 2
# 
# final = which(allErrors == min(allErrors), arr.ind = TRUE)
# 
# 
# finalAngleIndex = final[1]
# finalKIndex = final[2]
# finalAngle = allAngles[finalAngleIndex]
# finalK = allNeighbors[finalKIndex]
# 
# finalAngle
# finalK
actualXY = onlineSummary %>%  dplyr::select(posX, posY)

estXYfinalK = predXY(
  newSignals = onlineSummary[ , 6:(6+numMacs-1)],
  newAngles = onlineSummary[ , 4],
  trainData = offline,
  numAngles = finalAngle,
  k = finalK,
  weighted = FALSE
)
calcError(estXYfinalK, actualXY)
## [1] 2.7525

Weighted

allErrorsCVW = get_CV(K=K,v=v
                     ,offline=offline,onlineCVSummary=onlineCVSummary
                     ,permuteLocs=permuteLocs,numAngles=numAngles
                     ,weighted = TRUE)
##    user  system elapsed 
##    2.54    2.64  119.70
allErrorsW = find_best(allErrorsCVW)
## `summarise()` regrouping output by 'numAngles' (override with `.groups` argument)
allErrorsW
## # A tibble: 160 x 6
##    numAngles numNeighbors errValue errValueSD best  oneSE
##        <int>        <int>    <dbl>      <dbl> <lgl> <lgl>
##  1         1            1     3.43      0.125 FALSE FALSE
##  2         1            2     2.93      0.125 FALSE FALSE
##  3         1            3     2.77      0.125 FALSE FALSE
##  4         1            4     2.72      0.125 FALSE TRUE 
##  5         1            5     2.69      0.125 TRUE  FALSE
##  6         1            6     2.70      0.125 FALSE FALSE
##  7         1            7     2.73      0.125 FALSE FALSE
##  8         1            8     2.74      0.125 FALSE FALSE
##  9         1            9     2.76      0.125 FALSE FALSE
## 10         1           10     2.78      0.125 FALSE FALSE
## # ... with 150 more rows
print(filter(allErrorsW,best | oneSE))
## # A tibble: 2 x 6
##   numAngles numNeighbors errValue errValueSD best  oneSE
##       <int>        <int>    <dbl>      <dbl> <lgl> <lgl>
## 1         1            4     2.72      0.125 FALSE TRUE 
## 2         1            5     2.69      0.125 TRUE  FALSE
plot_best(allErrorsW)

Final Model

finalW = filter(allErrorsW,oneSE)

# BUG FIX: these previously read from `final` (the UNWEIGHTED one-SE model),
# so the weighted final model was evaluated with the unweighted parameters.
# They now read from `finalW` selected just above.
finalAngleW = finalW$numAngles
finalKW = finalW$numNeighbors
finalAngleW
## [1] 1
finalKW
## [1] 4
actualXY = onlineSummary %>%  dplyr::select(posX, posY)


estXYfinalKW = predXY(
  newSignals = onlineSummary[ , 6:(6+numMacs-1)],
  newAngles = onlineSummary[ , 4],
  trainData = offline,
  numAngles = finalAngleW,
  k = finalKW,
  weighted = TRUE
)
calcError(estXYfinalKW, actualXY)
## [1] 2.732295